# Imports
import shap
import numpy as np
import time
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
shap.initjs()
# Read data
data = pd.read_csv('Dataset-Unicauca-Version2-87Atts-Clean.csv')
# Rename Labels to DataFlow from previous processing that bucketed data usage
data.rename({"labels": "DataFlow"}, axis=1, inplace=True)
# Label-encode Protocol Names to be used as targets for the Random Forest model
lb_make = LabelEncoder()
data["labels"] = lb_make.fit_transform(data["ProtocolName"])
data[["ProtocolName", "labels"]].head(11)
# Mapping of protocol name -> encoded integer id (consumed by the dropdown widgets)
mapped_label_ids = dict(zip(lb_make.classes_, range(len(lb_make.classes_))))
# Drop columns not needed for modelling (one call instead of two)
data.drop(columns=["Flow.ID", "ProtocolName"], inplace=True)
# Split data into train/test.  Select the target by NAME rather than by
# position (the original used iloc[:, -1]) so the split does not silently
# break if the column order ever changes.
y = data["labels"]
data.drop(columns=["labels"], inplace=True)
# random_state pinned for reproducibility, consistent with the forest's random_state=0
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.2, random_state=0)
# Prepare the interactive predicted-label selector used by the visualizations
import ipywidgets as widgets

# All distinct encoded labels, kept for reference
list_of_labels = y.unique().tolist()
# (label, index) pairs so a dropdown could return the index of a label
tuple_of_labels = list(zip(list_of_labels, range(len(list_of_labels))))


def _label_dropdown():
    """Build a dropdown mapping protocol names to their encoded ids."""
    return widgets.Dropdown(
        options=mapped_label_ids,
        value=0,
        description='Select Label:',
    )


# Two independent selectors: some plots below compare a pair of labels
current_label = _label_dropdown()
current_label2 = _label_dropdown()
# Function to report accuracy
def print_accuracy(f, X=None, y=None):
    """Print the percentage accuracy of predictor ``f`` on a hold-out set.

    Parameters
    ----------
    f : callable
        Prediction function, e.g. ``model.predict``; called as ``f(X)``.
    X : array-like, optional
        Features to predict on.  Defaults to the notebook-level ``X_test``.
    y : array-like, optional
        True labels.  Defaults to the notebook-level ``y_test``.
    """
    # Resolve defaults lazily so the function can also score other splits.
    if X is None:
        X = X_test
    if y is None:
        y = y_test
    accuracy = 100 * np.sum(f(X) == y) / len(y)
    print("Accuracy = {0}%".format(accuracy))
    time.sleep(0.5)  # let the print flush before any progress bars appear
# Random-forest model to explain (only the first 5000 records — the entire
# dataset takes too long to model)
n_fit = 5000
rforest = RandomForestClassifier(
    n_estimators=100,
    max_depth=None,
    min_samples_split=20,
    random_state=0,
)
rforest.fit(X_train.head(n_fit), y_train.head(n_fit))
print_accuracy(rforest.predict)
Accuracy = 83.94431554524363%
# Explain the test-set predictions — NOTE sample may need to be 100.
# KernelExplainer is model-agnostic but slow, so pass a sampled background
# and explain only a sample of test rows (the full dataset is too massive).
# NOTE(review): each shap.sample(...) call draws a *different* random sample;
# later cells that call shap.sample(X_test, ...) again will not get the same
# rows these SHAP values were computed on — confirm intent.
background = shap.sample(X_train, nsamples=1000)
explainer = shap.KernelExplainer(rforest.predict_proba, background)
shap_values = explainer.shap_values(shap.sample(X_test, nsamples=1000))
Using 1000 background data samples could cause slower run times. Consider using shap.sample(data, K) or shap.kmeans(data, K) to summarize the background as K samples.
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-15-bf5d6947cc28> in <module> 1 explainer = shap.KernelExplainer(rforest.predict_proba, shap.sample(X_train, nsamples=1000)) 2 shap_values = explainer.shap_values(shap.sample(X_test, nsamples=1000)) ----> 3 shap_interaction_values = explainer.shap_interaction_values(shap.sample(X_test, nsamples=1000)) AttributeError: 'Kernel' object has no attribute 'shap_interaction_values'
This is a standard violin plot but with outliers drawn as points. This gives a more accurate representation of the density of the outliers than a kernel density estimated from so few points. The color represents the average feature value at that position, so red regions have mostly high-valued features while blue regions have mostly low-valued features.
# Display the label-selector widget (the notebook renders a cell's last expression)
current_label
# Violin summary plot of per-feature SHAP values for the selected class.
# NOTE(review): shap_values was computed on shap.sample(X_test, nsamples=1000)
# (a random sample), but `features` here is the *first* 1000 rows of X_test,
# so the feature-value colors may not correspond to the explained rows —
# confirm the intended pairing.
shap.summary_plot(shap_values = shap_values[current_label.value],
features = X_test.iloc[0:1000,:],
plot_type="violin")
By changing the label to something else we can see which features on the left have a higher impact on the model predicting this label. For example, when predicting the Protocol Name of "99TAXI" we see that the feature L7Protocol has little importance in predicting this Protocol. However, the feature Forward Packet Length does.
# Display the label-selector widget
current_label
# Force plot for the class chosen in the dropdown.  The base value must come
# from the SAME class index as the SHAP values, otherwise the additive
# explanation does not sum correctly (the original passed expected_value[0]
# regardless of the selected label).
# NOTE(review): shap.sample draws a fresh random sample here, which will not
# match the rows shap_values was computed on — confirm the intended rows.
shap.force_plot(explainer.expected_value[current_label.value],
                shap_values[current_label.value],
                shap.sample(X_test, nsamples=1000),
                link='logit')
Again, by changing the predicted label to something else we can see which features are significant and which are not, as in the previous visualization. However, we can also interact with different features and comparisons from additional labels at the top and to the left of the visualization. We can see that, when both the highlighted features equal a particular value, what the other features we are comparing against on top may equal.
# Display both label-selector widgets
current_label
current_label2
# NOTE(review): dependence_plot's first argument should identify a *feature*
# (column index or name); the tuple form is only supported for SHAP
# interaction values, which KernelExplainer cannot produce (see the
# AttributeError traceback above).  Here the tuple is built from *label* ids
# chosen in the dropdowns, so this call appears to conflate labels with
# features — verify against the shap.dependence_plot documentation.
shap.dependence_plot((current_label.value, current_label2.value), shap_values[current_label2.value], shap.sample(X_test, nsamples=1000))
This visualization can show the interactions between two different features. If the features were dependent on each other, there would be an inter-mingling of many colored points. Otherwise, the features are not correlated with each other.
# Prepare SHAP values
# (Alternative explainers kept for reference; note `dtree` is not defined
# anywhere in this notebook, so the TreeExplainer lines would fail as-is.)
#explainer = shap.KernelExplainer(rforest.predict_proba, X_train)
#shap_values = explainer.shap_values(shap.sample(X_test, nsamples=1000))
#test1 = shap.TreeExplainer(dtree)
#t = test1.shap_values(shap.sample(X_test, nsamples=1000))
# Per-class base values of the explainer — the average model output for each
# class over the background sample (NOT predictions on X_test); one entry per
# encoded protocol label.
explainer.expected_value
array([2.61567113e-02, 1.67824345e-03, 4.08037996e-04, 3.13699037e-04,
4.06056268e-03, 3.01564465e-03, 9.47937557e-03, 1.89808959e-04,
9.17248445e-03, 1.72129001e-04, 1.24224762e-02, 2.77426349e-01,
5.76155499e-04, 1.75293698e-01, 8.86565280e-02, 1.59998316e-01,
1.54699557e-03, 2.20522014e-04, 1.62389081e-02, 6.53684927e-03,
3.85873984e-04, 2.00937275e-04, 1.72227466e-03, 1.65858785e-04,
8.31043848e-03, 3.49279800e-04, 3.31635328e-04, 1.16458784e-01,
1.28437717e-04, 7.26903341e-03, 3.42807186e-04, 8.41006003e-04,
2.62275241e-04, 9.27036366e-03, 7.88941352e-03, 5.25080873e-02])
# View the features that drove the prediction for one specific class.
# The original printed int(expected_value[0]) — the truncated *base
# probability* of class 0 — as the "label", while plotting SHAP values for
# class 35 against class 0's base value.  Use a single class index for the
# message, the base value, and the SHAP values so they agree.
label_idx = 35
print("Visualization for predicted label: " + str(label_idx))
# NOTE(review): shap.sample draws a fresh random sample that will not match
# the rows shap_values was computed on — confirm the intended rows.
shap.force_plot(explainer.expected_value[label_idx], shap_values[label_idx], shap.sample(X_test, nsamples=1000))
Visualization for predicted label: 0
Now interact with the dropdowns at the top to select the feature and see the values of that feature that may predict the label above.